In this section we’ll continue using the CRC dataset, together with the NCI60 dataset for the larger examples.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Read the two example datasets from the local data directory.
CRC <- read.csv(file = "./data/CRC_train.csv")
NCI60 <- read.csv(file = "./data/NCI60.csv")
# Show the column names of NCI60.
colnames(NCI60)
## [1] "BR_BT549_a" "BR_BT549_b" "BR_HS578T_a" "BR_HS578T_b"
## [5] "BR_MCF7_a" "BR_MCF7_b" "BR_MDAMB231_a" "BR_MDAMB231_b"
## [9] "BR_MDAMB468_a" "BR_MDAMB468_b" "BR_T47D_a" "BR_T47D_b"
## [13] "CNS_SF268_a" "CNS_SF268_b" "CNS_SF295_a" "CNS_SF295_b"
## [17] "CNS_SF539_a" "CNS_SF539_b" "CNS_SNB19_a" "CNS_SNB19_b"
## [21] "CNS_SNB75_a" "CNS_SNB75_b" "CNS_U251_a" "CNS_U251_b"
## [25] "CO_COLO205_a" "CO_COLO205_b" "CO_HCC2998_a" "CO_HCC2998_b"
## [29] "CO_HCT116_a" "CO_HCT116_b" "CO_HCT15_a" "CO_HCT15_b"
## [33] "CO_HT29_a" "CO_HT29_b" "CO_KM12_a" "CO_KM12_b"
## [37] "CO_SW620_a" "CO_SW620_b" "LC_A549_a" "LC_A549_b"
## [41] "LC_EKVX_a" "LC_EKVX_b" "LC_HOP62_a" "LC_HOP62_b"
## [45] "LC_HOP92_a" "LC_HOP92_b" "LC_NCIH226_a" "LC_NCIH226_b"
## [49] "LC_NCIH23_a" "LC_NCIH23_b" "LC_NCIH322M_a" "LC_NCIH322M_b"
## [53] "LC_NCIH460_a" "LC_NCIH460_b" "LC_NCIH522_a" "LC_NCIH522_b"
## [57] "LE_CCRFCEM_a" "LE_CCRFCEM_b" "LE_HL60_a" "LE_HL60_b"
## [61] "LE_K562_a" "LE_K562_b" "LE_MOLT4_a" "LE_MOLT4_b"
## [65] "LE_RPMI8226_a" "LE_RPMI8226_b" "LE_SR_a" "LE_SR_b"
## [69] "ME_LOXIMVI_a" "ME_LOXIMVI_b" "ME_M14_a" "ME_M14_b"
## [73] "ME_MALME3M_a" "ME_MALME3M_b" "ME_MDAMB435_a" "ME_MDAMB435_b"
## [77] "ME_SKMEL2_a" "ME_SKMEL2_b" "ME_SKMEL28_a" "ME_SKMEL28_b"
## [81] "ME_SKMEL5_a" "ME_SKMEL5_b" "ME_UACC257_a" "ME_UACC257_b"
## [85] "ME_UACC62_a" "ME_UACC62_b" "OV_IGROV1_a" "OV_IGROV1_b"
## [89] "OV_NCIADRRES_a" "OV_NCIADRRES_b" "OV_OVCAR3_a" "OV_OVCAR3_b"
## [93] "OV_OVCAR4_a" "OV_OVCAR4_b" "OV_OVCAR5_a" "OV_OVCAR5_b"
## [97] "OV_OVCAR8_a" "OV_OVCAR8_b" "OV_SKOV3_a" "OV_SKOV3_b"
## [101] "PR_DU145_a" "PR_DU145_b" "PR_PC3_a" "PR_PC3_b"
## [105] "RE_7860_a" "RE_7860_b" "RE_A498_a" "RE_A498_b"
## [109] "RE_ACHN_a" "RE_ACHN_b" "RE_CAKI1_a" "RE_CAKI1_b"
## [113] "RE_RXF393_a" "RE_RXF393_b" "RE_SN12C_a" "RE_SN12C_b"
## [117] "RE_TK10_a" "RE_TK10_b" "RE_UO31_a" "RE_UO31_b"
## [121] "Protein"
## 2.1 Deal with overplotting
# Base scatter plot of two CRC columns; the layers below are added to `p`
p <- ggplot(data = CRC, mapping = aes(x = SERPINA3, y = TIMP1))
p + geom_point()
# Jitter: move each point by a small, random amount
p + geom_jitter(width = 0.25)
# A larger dataset: two NCI60 columns plotted against each other
s <- ggplot(data = NCI60, mapping = aes(x = BR_BT549_a, y = BR_HS578T_a))
s + geom_point()
# Hollow circles (shape 1) instead of the default solid points
s + geom_point(shape = 1)
# Pixel-sized points
s + geom_point(shape = ".")
# Alpha blending (transparency): when alpha is written as a ratio, the
# denominator is the number of overplotted points needed for a solid colour.
s + geom_point(alpha = 1 / 3)
s + geom_point(alpha = 1 / 5)
s + geom_point(alpha = 1 / 10)
## 2.2 Order bar chart
# Default bar chart: one bar per Sub_group, bars in factor-level order
g <- ggplot(data = CRC, mapping = aes(x = Sub_group))
g + geom_bar()
# Count the number of samples in each sub group
subgroup <- CRC %>%
  group_by(Sub_group) %>%
  summarise(n = n())
subgroup
## # A tibble: 3 × 2
## Sub_group n
## <fctr> <int>
## 1 Benign 34
## 2 CRC 100
## 3 Healthy 66
# Sort the rows by increasing count
subgroup <- arrange(subgroup, n)
# Rebuild the factor with its levels in the sorted row order, so the plot
# keeps that order instead of the default level order
subgroup$Sub_group <- factor(subgroup$Sub_group, levels = subgroup$Sub_group)
subgroup
## # A tibble: 3 × 2
## Sub_group n
## <fctr> <int>
## 1 Benign 34
## 2 Healthy 66
## 3 CRC 100
# Bar heights are taken directly from column n
ggplot(data = subgroup, mapping = aes(x = Sub_group, y = n)) +
  geom_col()
## 2.3 Zooming
# Base histogram object for SERPINA3; the layers below are added to `h`
h <- ggplot(data = CRC, mapping = aes(x = SERPINA3))
# Default binning (ggplot2 reports the number of bins it picked)
h + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Change the bar width
h + geom_histogram(binwidth = 0.1)
# Zoom in by changing the x-axis limits of the coordinate system
h +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(12.5, 16))
# Colour a specific point in the scatter plot
# Reference plot without any highlighting
ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1))
# Work on a copy so that CRC itself is not modified
CRC2 <- CRC
highlight.sample <- "P1D2"
# Named colour vector: names match the values stored in CRC2$highlight
mycolours <- c("highlight" = "red", "normal" = "grey50")
# Logical mask marking the row(s) whose Sample equals highlight.sample
is.highlight <- CRC2$Sample == highlight.sample
CRC2$highlight <- ifelse(is.highlight, "highlight", "normal")
# Rows to label in the plot (includes the new highlight column)
textdf <- CRC2[is.highlight, ]
textdf
## A1AG2 AFM AHSG AIAG.Bovine ANT3 AOC3 APOB
## 115 14.43676 15.68612 19.4506 15.06871 17.12417 9.772163 15.41505
## ATRN BTD C20orf3 CADM1 CD163 CD44 CDH5
## 115 13.33803 15.87152 10.65438 8.843735 11.22391 9.635675 8.581254
## CFH CFI CLU CP CTSD DKFZp686N02209 DSG2
## 115 17.39842 17.52245 19.63051 15.54674 9.689814 20.61501 10.41874
## ECM1 F11 F5 FCGBP FETUA.Bovine FETUB FGA
## 115 12.30604 15.25164 12.21818 12.18561 17.07109 13.92578 8.627747
## FGG FHR3 FN1 GOLM1 HP HRG HYOU1 ICAM1
## 115 10.34984 10.57234 12.41536 8.566989 20.7664 16.89356 8.362621 8.47747
## IGFBP3 IGHA2 IGHG2 ITIH4 KLKB1 KNG1 LAMP2 LCN2
## 115 9.71304 18.1794 21.37353 15.34769 13.75194 17.13578 14.36741 9.43319
## LGALS3BP LRG1 LUM LYVE1 MMRN1 MPO MRC2
## 115 14.05995 14.73259 15.30533 12.07749 11.40201 11.78676 10.82856
## MST1 NCAM1 ORM1 PGCP PIGR PLTP PLXDC2 PON1
## 115 12.66785 10.44025 16.78709 8.633163 8.614645 11.44895 9.736548 16.0177
## PRG4 PROC PTPRJ Q5JNX2 SERPINA1 SERPINA3 SERPINA6
## 115 12.33507 9.898844 9.300791 20.02495 16.39812 14.48573 15.47579
## SERPINA7 THBS1 TIMP1 TNC VTN VWF Sample Group
## 115 13.08085 15.1787 11.91644 10.15539 12.28754 10.89411 P1D2 Healthy
## Age Gender Cancer_stage Tumour_location Sub_group highlight
## 115 68 female NA <NA> Healthy highlight
# Scatter plot with the selected sample coloured via the highlight column and
# labelled slightly below its point (y is scaled by 0.99 to offset the text).
ggplot(data = CRC2, aes(x = SERPINA3, y = TIMP1)) +
  geom_point(size = 3, aes(colour = highlight)) +
  scale_color_manual("Sample", values = mycolours) +
  # The label is read from textdf's own Sample column, so each label is tied
  # to its row rather than looked up in the global highlight.sample vector.
  geom_text(
    data = textdf,
    aes(x = SERPINA3, y = TIMP1 * 0.99, label = Sample),
    colour = "red"
  )
## 2.4 Change labels, themes, and scales
# 2.4.1 titles, subtitles and captions
# Base scatter plot coloured by sub group; reused by the examples below
p1 <- ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1, colour = Sub_group))
p1 +
  labs(
    title = "Compare between sub groups",
    subtitle = "Benign samples are mixed with the other two groups",
    caption = "Data vis example"
  )
# Axis labels and legend titles
p1 +
  labs(
    x = "Protein SERPINA3",
    y = "Protein TIMP1",
    color = "Sub groups"
  )
# 2.4.2 Theme: Change appearance of non-data elements
# Built-in complete themes
p1 + theme_grey()
p1 + theme_classic()
p1 + theme_dark()
p1 + theme_light()
p1 + theme_void()
# Override a single theme element: white panel with a grey border
p1 +
  theme(
    panel.background = element_rect(fill = "white", colour = "grey50")
  )
# Keep two protein columns plus the sample identifier
CRC.two.prot <- CRC[, c("SERPINA3", "TIMP1", "Sample")]
# Reshape the first 20 rows to long format: one row per (Sample, Protein).
# Note: gather() is superseded by pivot_longer() in current tidyr releases.
plot.data <- gather(
  CRC.two.prot[seq_len(20), ],
  key = Protein,
  value = Abundance,
  -Sample
)
# One line per protein across the samples
p2 <- ggplot(data = plot.data) +
  geom_line(
    mapping = aes(x = Sample, y = Abundance, group = Protein, colour = Protein)
  )
# Change the appearance and the orientation angle of axis labels
# Reference: http://ggplot2.tidyverse.org/reference/theme.html
p2 +
  theme(
    axis.text.x = element_text(face = "bold", color = "blue", size = 10, angle = 45),
    axis.title.x = element_text(face = "bold", colour = "#990000", size = 20),
    axis.text.y = element_text(face = "bold", color = "blue", size = 14),
    axis.title.y = element_text(face = "bold", colour = "#990000", size = 20)
  )
# Hide the x and y axis tick labels and axis titles
p2 +
  theme(
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank()
  )
# Remove the axis tick marks (the tick labels are kept)
p2 + theme(axis.ticks = element_blank())
# Adjust Legend
p2 + theme(legend.position = "right") # the default
p2 + theme(legend.position = "bottom")
p2 + theme(legend.position = "none")
# Strips
# Facetting splits the data into subsets and draws the same graph for each
# subset, arranged as a table of panels
p3 <- ggplot(data = CRC) +
  geom_point(mapping = aes(x = SERPINA3, y = TIMP1)) +
  facet_grid(. ~ Group)
p3
# Restyle the strip (the facet label box): yellow fill, white outline
p3 +
  theme(
    strip.background = element_rect(colour = "white", fill = "yellow")
  )
# 2.4.3 Scales: control the appearance of data elements.
# There is a scale function for each aesthetic.
# Change the colour scheme
p1 + scale_color_grey()
p1 + scale_color_brewer(palette = "Set1")
# Change the axis scales: log10 on one axis at a time
p1 + scale_x_log10()
p1 + scale_y_log10()
# Rename both axes and restrict the x range via the scale limits
p1 +
  scale_x_continuous(name = "Protein SERPINA3", limits = c(12, 16)) +
  scale_y_continuous(name = "Protein TIMP1")
## Warning: Removed 15 rows containing missing values (geom_point).
Challenge: Draw a scatter plot of the abundance of proteins AFM and AHSG across all samples, and highlight samples P1A2, P1B1 and P1B12.
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
# Challenge solution: scatter plot of AFM against AHSG with a set of samples
# highlighted in red and labelled with their sample names.
CRC2 <- CRC
highlight.sample <- c("P1A2", "P1B1", "P1B12")
# Flag every row whose Sample is one of the selected names
CRC2$highlight <- ifelse(CRC2$Sample %in% highlight.sample, "highlight", "normal")
# Rows to label; these stay in CRC2's row order, not highlight.sample's order
textdf <- CRC2[CRC2$Sample %in% highlight.sample, ]
mycolours <- c("highlight" = "red", "normal" = "grey50")
ggplot(data = CRC2, aes(x = AFM, y = AHSG)) +
  geom_point(size = 3, aes(colour = highlight)) +
  scale_color_manual("Sample", values = mycolours) +
  # Take each label from textdf's own Sample column. Mapping the global
  # highlight.sample vector would pair labels with rows by position, which
  # mislabels points whenever the row order differs from the vector order.
  geom_text(
    data = textdf,
    aes(x = AFM * 0.99, y = AHSG, label = Sample),
    colour = "red"
  )
Challenge: Zoom in on the figure you just plotted so that the x-axis shows AFM from 15.5 to 16.2.
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
\(~\)
# Challenge solution: the highlighted AFM/AHSG scatter plot, zoomed in on the
# x-axis range 15.5 to 16.2. Uses CRC2, textdf and mycolours created above.
ggplot(data = CRC2, aes(x = AFM, y = AHSG)) +
  geom_point(size = 3, aes(colour = highlight)) +
  scale_color_manual("Sample", values = mycolours) +
  # Take each label from textdf's own Sample column so it always matches its
  # point; a global name vector would be paired with the rows by position.
  geom_text(
    data = textdf,
    aes(x = AFM * 0.995, y = AHSG, label = Sample),
    colour = "red"
  ) +
  coord_cartesian(xlim = c(15.5, 16.2))